Methods
Exploratory Data Analysis (EDA)
Inspect Data
head(london, 5)
## realSum room_type room_shared room_private person_capacity
## 1 121.1223 Private room False True 2
## 2 195.9124 Private room False True 2
## 3 193.3253 Private room False True 3
## 4 180.3899 Private room False True 2
## 5 405.7010 Entire home/apt False False 3
## host_is_superhost multi biz cleanliness_rating guest_satisfaction_overall
## 1 False 0 0 6 69
## 2 False 1 0 10 96
## 3 False 1 0 10 95
## 4 False 1 0 9 87
## 5 False 0 1 7 65
## bedrooms dist metro_dist attr_index attr_index_norm rest_index
## 1 1 5.734117 0.4370940 222.8822 15.49341 470.0885
## 2 1 4.788905 1.4640505 235.3858 16.36259 530.1335
## 3 1 4.596677 0.4503062 268.9138 18.69325 548.9876
## 4 1 2.054769 0.1326705 472.3813 32.83707 1021.2711
## 5 0 4.491277 0.3541075 318.4915 22.13958 692.7754
## rest_index_norm lng lat
## 1 8.413765 -0.04975 51.52570
## 2 9.488466 -0.08475 51.54210
## 3 9.825922 -0.14585 51.54802
## 4 18.278973 -0.10611 51.52108
## 5 12.399473 -0.18797 51.49399
The structure of the dataset
str(london)
## 'data.frame': 5379 obs. of 19 variables:
## $ realSum : num 121 196 193 180 406 ...
## $ room_type : chr "Private room" "Private room" "Private room" "Private room" ...
## $ room_shared : chr "False" "False" "False" "False" ...
## $ room_private : chr "True" "True" "True" "True" ...
## $ person_capacity : num 2 2 3 2 3 2 2 2 4 2 ...
## $ host_is_superhost : chr "False" "False" "False" "False" ...
## $ multi : int 0 1 1 1 0 0 0 0 0 1 ...
## $ biz : int 0 0 0 0 1 1 1 1 1 0 ...
## $ cleanliness_rating : num 6 10 10 9 7 9 10 9 9 10 ...
## $ guest_satisfaction_overall: num 69 96 95 87 65 93 97 88 87 97 ...
## $ bedrooms : int 1 1 1 1 0 0 1 1 1 1 ...
## $ dist : num 5.73 4.79 4.6 2.05 4.49 ...
## $ metro_dist : num 0.437 1.464 0.45 0.133 0.354 ...
## $ attr_index : num 223 235 269 472 318 ...
## $ attr_index_norm : num 15.5 16.4 18.7 32.8 22.1 ...
## $ rest_index : num 470 530 549 1021 693 ...
## $ rest_index_norm : num 8.41 9.49 9.83 18.28 12.4 ...
## $ lng : num -0.0498 -0.0848 -0.1459 -0.1061 -0.188 ...
## $ lat : num 51.5 51.5 51.5 51.5 51.5 ...
The summary statistics
summary(london)
## realSum room_type room_shared room_private
## Min. : 54.33 Length:5379 Length:5379 Length:5379
## 1st Qu.: 174.51 Class :character Class :character Class :character
## Median : 268.12 Mode :character Mode :character Mode :character
## Mean : 364.39
## 3rd Qu.: 438.27
## Max. :12937.27
## person_capacity host_is_superhost multi biz
## Min. :2.000 Length:5379 Min. :0.0000 Min. :0.0000
## 1st Qu.:2.000 Class :character 1st Qu.:0.0000 1st Qu.:0.0000
## Median :2.000 Mode :character Median :0.0000 Median :0.0000
## Mean :2.858 Mean :0.2798 Mean :0.3579
## 3rd Qu.:4.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :6.000 Max. :1.0000 Max. :1.0000
## cleanliness_rating guest_satisfaction_overall bedrooms
## Min. : 2.000 Min. : 20.00 Min. :0.000
## 1st Qu.: 9.000 1st Qu.: 87.00 1st Qu.:1.000
## Median :10.000 Median : 94.00 Median :1.000
## Mean : 9.194 Mean : 90.92 Mean :1.133
## 3rd Qu.:10.000 3rd Qu.: 99.00 3rd Qu.:1.000
## Max. :10.000 Max. :100.00 Max. :8.000
## dist metro_dist attr_index attr_index_norm
## Min. : 0.04056 Min. :0.01388 Min. : 68.74 Min. : 4.778
## 1st Qu.: 3.54568 1st Qu.:0.32404 1st Qu.: 177.22 1st Qu.: 12.320
## Median : 4.93914 Median :0.53613 Median : 247.65 Median : 17.215
## Mean : 5.32762 Mean :1.01653 Mean : 294.58 Mean : 20.477
## 3rd Qu.: 6.83807 3rd Qu.:1.09076 3rd Qu.: 361.07 3rd Qu.: 25.099
## Max. :17.32120 Max. :9.17409 Max. :1438.56 Max. :100.000
## rest_index rest_index_norm lng lat
## Min. : 140.5 Min. : 2.515 Min. :-0.25170 Min. :51.41
## 1st Qu.: 382.1 1st Qu.: 6.839 1st Qu.:-0.16996 1st Qu.:51.49
## Median : 527.3 Median : 9.439 Median :-0.11813 Median :51.51
## Mean : 625.6 Mean : 11.197 Mean :-0.11478 Mean :51.50
## 3rd Qu.: 764.2 3rd Qu.: 13.678 3rd Qu.:-0.06772 3rd Qu.:51.53
## Max. :5587.1 Max. :100.000 Max. : 0.12018 Max. :51.58
Handle Missing Data
colSums(is.na(london))
## realSum room_type
## 0 0
## room_shared room_private
## 0 0
## person_capacity host_is_superhost
## 0 0
## multi biz
## 0 0
## cleanliness_rating guest_satisfaction_overall
## 0 0
## bedrooms dist
## 0 0
## metro_dist attr_index
## 0 0
## attr_index_norm rest_index
## 0 0
## rest_index_norm lng
## 0 0
## lat
## 0
Numerical/Categorical variables
# Response and predictor groupings used throughout the analysis
target <- "realSum"
numerical_vars <- c("dist", "metro_dist", "attr_index", "attr_index_norm",
                    "rest_index", "rest_index_norm", "lng", "lat")
# Everything that is neither the response nor a numeric predictor is treated as categorical
categorical_vars <- setdiff(names(london), c(numerical_vars, target))
Combining categories
# Collapse sparse rating levels so each category has enough observations.
# Cleanliness ratings 1-6 become a single "6 or below" level.
london$cleanliness_rating[london$cleanliness_rating %in% 1:6] = 6
# Guest satisfaction is binned into 5 buckets labeled by each bucket's upper bound.
# Order matters: each assignment's target value must not fall inside a later range.
london$guest_satisfaction_overall[london$guest_satisfaction_overall %in% 1:79] = 79
london$guest_satisfaction_overall[london$guest_satisfaction_overall %in% 80:89] = 89
london$guest_satisfaction_overall[london$guest_satisfaction_overall %in% 90:95] = 95
london$guest_satisfaction_overall[london$guest_satisfaction_overall %in% 96:99] = 99
# This last line is a no-op (100 maps to itself); kept for symmetry with the bins above.
london$guest_satisfaction_overall[london$guest_satisfaction_overall %in% 100] = 100
Identify Outliers
# Interquartile Range method (IQR)
# Identify the indices of outliers in a numeric vector.
#
# data:      numeric vector to scan
# method:    "iqr"     — flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
#                        (threshold is ignored for this method)
#            "z-score" — flag values whose |z-score| EXCEEDS `threshold`
# threshold: cutoff for the z-score method
#
# Returns an integer vector of outlier positions (possibly empty).
get_outliers_idx = function(data, method = "iqr", threshold = 1) {
  if (method == "iqr") {
    Q1 = quantile(data, 0.25)
    Q3 = quantile(data, 0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    idx = which(data < lower_bound | data > upper_bound)
  }
  else if (method == "z-score") {
    z_scores = abs(scale(data))
    # BUG FIX: the original used `z_scores <= threshold`, which selected the
    # NON-outliers; an outlier is a point whose |z| exceeds the threshold.
    idx = which(z_scores > threshold)
  }
  else {
    stop("method not found")
  }
  return(idx)
}
Remove outliers
# Collect the union of outlier row indices across the response and all numeric columns
idx = c()
for (col in c(numerical_vars, target)) {
  idx = union(idx, get_outliers_idx(london[[col]]))
}
# BUG FIX: when no outliers are found, `london[-idx, ]` either errors (idx is NULL)
# or — with idx = integer(0) — drops EVERY row, because negative indexing with an
# empty vector selects nothing. Only subset when at least one index exists.
if (length(idx) > 0) {
  london = london[-idx, ]
}
Data Visualization: numerical variables
# One frequency histogram per numeric predictor
# par(mfrow = c(1, length(numerical_vars)))
for (num_col in numerical_vars) {
  hist(london[[num_col]],
       main = num_col, xlab = "Value", ylab = "Frequency",
       col = "steelblue")
}








The relationship between numerical variables
Pairplots
pairs(london[numerical_vars], col = "steelblue")

Correlation
round(cor(london[numerical_vars]), 2)
## dist metro_dist attr_index attr_index_norm rest_index
## dist 1.00 0.38 -0.90 -0.90 -0.89
## metro_dist 0.38 1.00 -0.41 -0.41 -0.46
## attr_index -0.90 -0.41 1.00 1.00 0.93
## attr_index_norm -0.90 -0.41 1.00 1.00 0.93
## rest_index -0.89 -0.46 0.93 0.93 1.00
## rest_index_norm -0.89 -0.46 0.93 0.93 1.00
## lng 0.02 0.27 0.04 0.04 -0.03
## lat -0.26 -0.17 0.17 0.17 0.26
## rest_index_norm lng lat
## dist -0.89 0.02 -0.26
## metro_dist -0.46 0.27 -0.17
## attr_index 0.93 0.04 0.17
## attr_index_norm 0.93 0.04 0.17
## rest_index 1.00 -0.03 0.26
## rest_index_norm 1.00 -0.03 0.26
## lng -0.03 1.00 0.27
## lat 0.26 0.27 1.00
library(corrplot)
## corrplot 0.92 loaded
cor_mx = cor(london[numerical_vars])
corrplot(cor_mx, method = "color")

Data Visualization: Categorical variables
# Boxplot of price against each categorical predictor
for (cat_col in categorical_vars) {
  boxplot(realSum ~ london[[cat_col]],
          data = london,
          main = paste0("realSum vs ", cat_col),
          xlab = cat_col,
          ylab = "Real Sum",
          col = "steelblue")
}










Unique values
# Show the distinct levels of every categorical variable
for (v in categorical_vars) {
  print(unique(london[[v]]))
}
## [1] "Private room" "Entire home/apt" "Shared room"
## [1] "False" "True"
## [1] "True" "False"
## [1] 2 3 4 6 5
## [1] "False" "True"
## [1] 0 1
## [1] 0 1
## [1] 6 10 9 7 8
## [1] 79 99 95 89 100
## [1] 1 0 2 3 5 4
Transform to factor variables
london[categorical_vars] = lapply(london[categorical_vars], as.factor)
Encoding categorical variables
# Binary-encode the True/False factors as 0/1 numerics.
# Comparing a factor against "True" works because `==` compares factor labels.
london$room_shared = ifelse(london$room_shared == "True", 1, 0)
london$room_private = ifelse(london$room_private == "True", 1, 0)
london$host_is_superhost = ifelse(london$host_is_superhost == "True", 1, 0)
# room_type becomes its factor level code (levels sort alphabetically:
# "Entire home/apt" = 1, "Private room" = 2, "Shared room" = 3) — an ordinal
# encoding of an unordered category, treated as numeric downstream.
london$room_type = as.numeric(london$room_type)
Exclude collinear predictors
# attr_index / rest_index are perfectly correlated with their normalized
# versions, so keep only the normalized ones; room_shared / room_private are
# redundant with room_type after encoding.
cor(london$attr_index, london$attr_index_norm)
## [1] 1
cor(london$rest_index, london$rest_index_norm)
## [1] 1
london = subset(london, select = -c(attr_index, rest_index, room_shared, room_private))
Main Effects Analysis
Performs diagnostic calculations for linear models
# Diagnostic metrics for a fitted linear model: RMSE on the response scale,
# R^2, and the number of estimated coefficients.
#
# model: a fitted lm object
# name:  label for the results row
# data:  data frame the model was fitted on; defaults to the global `london`
#        for backward compatibility with existing calls. Must have a `realSum`
#        column. (The original hardcoded `london`, which silently gives wrong
#        RMSEs for models fitted on any other data frame.)
#
# NOTE: this masks base::diag while defined; the name is kept so callers work.
diag = function(model, name, data = london){
  resid = fitted(model) - data$realSum
  rmse = sqrt(sum(resid^2)/nrow(data))
  r2 = summary(model)$r.squared
  coefs = nrow(summary(model)$coeff)
  data.frame(model = name, rmse = rmse, r2 = r2, coefficients = coefs )
}
Fit a full additive model
full_additive_mod = lm(realSum ~ ., data = london)
summary(full_additive_mod)
##
## Call:
## lm(formula = realSum ~ ., data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -288.96 -59.08 -13.68 40.08 633.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3381.1510 2978.3296 1.135 0.256335
## room_type -155.4010 3.7376 -41.577 < 2e-16 ***
## person_capacity3 24.9322 5.3273 4.680 2.96e-06 ***
## person_capacity4 63.0581 4.7714 13.216 < 2e-16 ***
## person_capacity5 67.9044 9.2153 7.369 2.06e-13 ***
## person_capacity6 124.2810 8.4745 14.665 < 2e-16 ***
## host_is_superhost 7.2442 4.4041 1.645 0.100070
## multi1 9.4659 3.7594 2.518 0.011842 *
## biz1 19.1805 3.9917 4.805 1.60e-06 ***
## cleanliness_rating7 -35.0173 12.1016 -2.894 0.003828 **
## cleanliness_rating8 -24.0712 10.2157 -2.356 0.018504 *
## cleanliness_rating9 -11.5472 10.3914 -1.111 0.266538
## cleanliness_rating10 13.7363 10.6878 1.285 0.198780
## guest_satisfaction_overall89 -14.1615 6.3229 -2.240 0.025162 *
## guest_satisfaction_overall95 -2.6760 7.0305 -0.381 0.703497
## guest_satisfaction_overall99 -2.2187 7.9615 -0.279 0.780507
## guest_satisfaction_overall100 13.3244 7.6002 1.753 0.079648 .
## bedrooms1 50.1649 5.7977 8.653 < 2e-16 ***
## bedrooms2 154.6249 7.7522 19.946 < 2e-16 ***
## bedrooms3 176.4219 14.8906 11.848 < 2e-16 ***
## bedrooms4 107.6973 36.3518 2.963 0.003067 **
## bedrooms5 70.5764 94.5533 0.746 0.455456
## dist 6.3820 1.9494 3.274 0.001069 **
## metro_dist -12.0505 3.5880 -3.359 0.000791 ***
## attr_index_norm 1.3816 0.6129 2.254 0.024222 *
## rest_index_norm 10.1527 1.1397 8.909 < 2e-16 ***
## lng -151.7518 28.3122 -5.360 8.77e-08 ***
## lat -60.1191 57.7076 -1.042 0.297570
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 94.25 on 4214 degrees of freedom
## Multiple R-squared: 0.6744, Adjusted R-squared: 0.6723
## F-statistic: 323.3 on 27 and 4214 DF, p-value: < 2.2e-16
linear_exp_results = diag(full_additive_mod, "full_additive_mod")
linear_exp_results
## model rmse r2 coefficients
## 1 full_additive_mod 93.93584 0.6744026 28
Fitting an additive model with no interactions to the data yields an
\(R^2 = 0.6744026\), and a \(RMSE\) of 93.93584. We will attempt to
improve on this.
We will use AIC to determine if any of the predictors can be
dropped.
aic_full_additive_mod = step(full_additive_mod, direction = "backward", trace = FALSE)
summary(aic_full_additive_mod)
##
## Call:
## lm(formula = realSum ~ room_type + person_capacity + host_is_superhost +
## multi + biz + cleanliness_rating + guest_satisfaction_overall +
## bedrooms + dist + metro_dist + attr_index_norm + rest_index_norm +
## lng, data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -289.82 -58.85 -13.66 40.09 635.01
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 278.4594 23.4605 11.869 < 2e-16 ***
## room_type -155.1896 3.7322 -41.582 < 2e-16 ***
## person_capacity3 24.8435 5.3267 4.664 3.20e-06 ***
## person_capacity4 62.9701 4.7707 13.199 < 2e-16 ***
## person_capacity5 67.7933 9.2147 7.357 2.25e-13 ***
## person_capacity6 124.3151 8.4745 14.669 < 2e-16 ***
## host_is_superhost 7.2056 4.4040 1.636 0.101883
## multi1 9.4140 3.7591 2.504 0.012307 *
## biz1 19.2866 3.9905 4.833 1.39e-06 ***
## cleanliness_rating7 -35.0345 12.1017 -2.895 0.003811 **
## cleanliness_rating8 -24.0373 10.2158 -2.353 0.018670 *
## cleanliness_rating9 -11.5115 10.3915 -1.108 0.268021
## cleanliness_rating10 13.9768 10.6854 1.308 0.190934
## guest_satisfaction_overall89 -14.1574 6.3230 -2.239 0.025205 *
## guest_satisfaction_overall95 -2.9137 7.0269 -0.415 0.678415
## guest_satisfaction_overall99 -2.4253 7.9591 -0.305 0.760598
## guest_satisfaction_overall100 13.0175 7.5946 1.714 0.086593 .
## bedrooms1 50.2044 5.7976 8.659 < 2e-16 ***
## bedrooms2 154.7353 7.7515 19.962 < 2e-16 ***
## bedrooms3 176.5603 14.8902 11.858 < 2e-16 ***
## bedrooms4 107.1389 36.3482 2.948 0.003220 **
## bedrooms5 70.2324 94.5537 0.743 0.457657
## dist 6.8714 1.8919 3.632 0.000285 ***
## metro_dist -11.3396 3.5226 -3.219 0.001296 **
## attr_index_norm 1.6071 0.5734 2.803 0.005086 **
## rest_index_norm 9.8638 1.1054 8.923 < 2e-16 ***
## lng -163.7628 25.8586 -6.333 2.66e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 94.25 on 4215 degrees of freedom
## Multiple R-squared: 0.6743, Adjusted R-squared: 0.6723
## F-statistic: 335.7 on 26 and 4215 DF, p-value: < 2.2e-16
row = diag(aic_full_additive_mod, "aic_full_additive_mod")
row
## model rmse r2 coefficients
## 1 aic_full_additive_mod 93.94794 0.6743188 27
linear_exp_results = rbind(linear_exp_results, row)
AIC eliminates just 1 parameter. The \(R^2\) value of this model left after AIC
elimination is almost the same at 0.6743188 as compared to the full
additive model with an \(R^2\) of
0.6744026. \(RMSE\) is about the
same.
We will now try BIC to see if this results in a smaller model.
bic_full_additive_mod = step(full_additive_mod, direction = "backward", k = log(nrow(london)), trace = FALSE)
summary(bic_full_additive_mod)
##
## Call:
## lm(formula = realSum ~ room_type + person_capacity + biz + cleanliness_rating +
## bedrooms + metro_dist + rest_index_norm + lng, data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -298.65 -59.18 -14.84 39.85 620.55
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 350.5486 13.6785 25.628 < 2e-16 ***
## room_type -154.0574 3.6957 -41.685 < 2e-16 ***
## person_capacity3 24.4040 5.3282 4.580 4.78e-06 ***
## person_capacity4 62.5354 4.7697 13.111 < 2e-16 ***
## person_capacity5 68.0410 9.2314 7.371 2.03e-13 ***
## person_capacity6 123.9125 8.4716 14.627 < 2e-16 ***
## biz1 13.1049 3.3915 3.864 0.000113 ***
## cleanliness_rating7 -39.7450 12.0190 -3.307 0.000951 ***
## cleanliness_rating8 -30.5394 9.4956 -3.216 0.001309 **
## cleanliness_rating9 -16.3915 9.0220 -1.817 0.069314 .
## cleanliness_rating10 18.9364 8.9279 2.121 0.033977 *
## bedrooms1 49.1771 5.7925 8.490 < 2e-16 ***
## bedrooms2 154.0982 7.7536 19.874 < 2e-16 ***
## bedrooms3 176.8116 14.9111 11.858 < 2e-16 ***
## bedrooms4 105.3139 36.4856 2.886 0.003916 **
## bedrooms5 60.8382 94.9652 0.641 0.521794
## metro_dist -11.3768 3.5201 -3.232 0.001239 **
## rest_index_norm 9.7773 0.4369 22.379 < 2e-16 ***
## lng -153.4795 25.3475 -6.055 1.53e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 94.73 on 4223 degrees of freedom
## Multiple R-squared: 0.6704, Adjusted R-squared: 0.669
## F-statistic: 477.1 on 18 and 4223 DF, p-value: < 2.2e-16
row = diag(bic_full_additive_mod, "bic_full_additive_model")
row
## model rmse r2 coefficients
## 1 bic_full_additive_model 94.51618 0.6703671 19
linear_exp_results = rbind(linear_exp_results, row)
BIC, as we would expect, produces a smaller model.
As there is little difference between these models, we will continue
the analysis with the full additive model. In summary, no one simple
additive model stands out.
Collinearity Analysis
Although collinearity likely will not affect prediction, it will
affect our ability to perform inference tests.
Let us check for collinearity using the variance inflation
factor.
Calculate VIF
library(faraway)
vif(full_additive_mod)
## room_type person_capacity3
## 1.700250 1.113204
## person_capacity4 person_capacity5
## 1.641293 1.249581
## person_capacity6 host_is_superhost
## 1.758753 1.280673
## multi1 biz1
## 1.351718 1.769287
## cleanliness_rating7 cleanliness_rating8
## 2.062160 6.001640
## cleanliness_rating9 cleanliness_rating10
## 11.016505 13.634101
## guest_satisfaction_overall89 guest_satisfaction_overall95
## 3.406338 4.430120
## guest_satisfaction_overall99 guest_satisfaction_overall100
## 4.729122 4.808630
## bedrooms1 bedrooms2
## 2.781491 3.122528
## bedrooms3 bedrooms4
## 1.428022 1.039663
## bedrooms5 dist
## 1.006265 6.559193
## metro_dist attr_index_norm
## 1.492643 10.780474
## rest_index_norm lng
## 9.348099 1.418359
## lat
## 1.397860
names(vif(full_additive_mod)[unname(vif(full_additive_mod)) > 5])
## [1] "cleanliness_rating8" "cleanliness_rating9" "cleanliness_rating10"
## [4] "dist" "attr_index_norm" "rest_index_norm"
Per the textbook, predictors with a VIF of more than 5 are suspicious
for collinearity. The variables with VIF values more than 5 include
dist, cleanliness_rating 8 9 10,
attr_index_norm, rest_index_norm. This makes
sense as dist is the distance to the city center, and it is
likely that if this is small, the number of attractions and restaurants
(as measured by attr_index_norm and
rest_index_norm) would be high. There are more attractions
and restaurants near the city center. These values are not tremendously
larger than 5, so we will keep them in the model.
Transformation Analysis
What does the distribution of the room prices look like?
hist(london$realSum,
main = "Distribution of realSum",
xlab = "realSum", col = "steelblue")

Maybe this would look better if we took the logarithm. This is a
typical transformation for prices that can vary over several orders of
magnitude.
# Histogram of log prices with a fitted normal density overlaid
hist(log(london$realSum), prob = TRUE,
     main = "Distribution of log(realSum)",
     xlab = "log(realSum)", col = "steelblue")  # fixed missing ")" in axis label
curve(dnorm(x, mean = mean(log(london$realSum)), sd = sd(log(london$realSum))),
      col = "darkorange", add = TRUE, lwd = 3)

This does look somewhat better. The values are more spread out, and
roughly approximate a normal distribution. Let us try to fit a model with
the log transformation of the response.
Function to perform diagnostics on functions with log
transformation of response
# Diagnostic metrics for a model fitted with a LOG-transformed response:
# fitted values are exponentiated back to the price scale before the RMSE.
#
# model: a fitted lm object with log(realSum) as response
# name:  label for the results row
# data:  data frame the model was fitted on; defaults to the global `london`
#        for backward compatibility. BUG FIX: the original hardcoded `london`,
#        which produced a length-recycling warning and a wrong RMSE when the
#        model was fitted on `london_no_unusual` (fewer rows).
log_diag = function(model, name, data = london){
  resid = exp(fitted(model)) - data$realSum
  rmse = sqrt(sum(resid^2)/nrow(data))
  r2 = summary(model)$r.squared
  coefs = nrow(summary(model)$coeff)
  data.frame(model = name, rmse = rmse, r2 = r2, coefficients = coefs )
}
Fit a model with log transformation of response and all available
predictors
full_log_model = lm(log(realSum) ~ ., data = london)
summary(full_log_model)
##
## Call:
## lm(formula = log(realSum) ~ ., data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88702 -0.19398 -0.02584 0.16167 1.61895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.283176 9.131690 0.907 0.364416
## room_type -0.542775 0.011460 -47.364 < 2e-16 ***
## person_capacity3 0.108903 0.016334 6.667 2.94e-11 ***
## person_capacity4 0.208920 0.014630 14.281 < 2e-16 ***
## person_capacity5 0.240328 0.028254 8.506 < 2e-16 ***
## person_capacity6 0.313299 0.025983 12.058 < 2e-16 ***
## host_is_superhost 0.032884 0.013503 2.435 0.014920 *
## multi1 0.033008 0.011527 2.864 0.004209 **
## biz1 0.042135 0.012239 3.443 0.000581 ***
## cleanliness_rating7 -0.117363 0.037104 -3.163 0.001572 **
## cleanliness_rating8 -0.067925 0.031322 -2.169 0.030168 *
## cleanliness_rating9 -0.023287 0.031861 -0.731 0.464890
## cleanliness_rating10 0.067127 0.032769 2.048 0.040573 *
## guest_satisfaction_overall89 -0.033908 0.019386 -1.749 0.080354 .
## guest_satisfaction_overall95 -0.007139 0.021556 -0.331 0.740518
## guest_satisfaction_overall99 0.000939 0.024410 0.038 0.969317
## guest_satisfaction_overall100 0.042024 0.023303 1.803 0.071397 .
## bedrooms1 0.111007 0.017776 6.245 4.66e-10 ***
## bedrooms2 0.345054 0.023769 14.517 < 2e-16 ***
## bedrooms3 0.408546 0.045655 8.949 < 2e-16 ***
## bedrooms4 0.325236 0.111456 2.918 0.003541 **
## bedrooms5 0.257491 0.289905 0.888 0.374488
## dist 0.014206 0.005977 2.377 0.017509 *
## metro_dist -0.046627 0.011001 -4.238 2.30e-05 ***
## attr_index_norm 0.004772 0.001879 2.539 0.011138 *
## rest_index_norm 0.030463 0.003494 8.718 < 2e-16 ***
## lng -0.617254 0.086806 -7.111 1.35e-12 ***
## lat -0.050724 0.176934 -0.287 0.774367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.289 on 4214 degrees of freedom
## Multiple R-squared: 0.6892, Adjusted R-squared: 0.6872
## F-statistic: 346 on 27 and 4214 DF, p-value: < 2.2e-16
transform_results = log_diag(full_log_model, "full_log_model")
transform_results
## model rmse r2 coefficients
## 1 full_log_model 94.81446 0.6891645 28
The log transformation gives a better \(R^2\). The \(R^2\) value increased from around 0.6744026
with the linear model using all predictors to 0.6891645 with the log
transform. The log model also has similar RMSE.
Does the Box-Cox transformation work better than log transformation
of the response?
Inverse of Box-Cox to display results
# Inverse Box-Cox transform: map a transformed value back to the original
# scale. lambda == 0 is the log case (inverse is exp); otherwise invert
# (x^lambda - 1) / lambda.
invBoxCox = function(x, lambda) {
  if (lambda == 0) {
    exp(x)
  } else {
    (lambda * x + 1)^(1 / lambda)
  }
}
Perform Box-Cox transform
library(MASS)
# Profile log-likelihood over Box-Cox lambda for the additive model
bc = boxcox(full_additive_mod)

# Lambda maximizing the profile likelihood — close to 0, i.e. near a log transform
(lambda = bc$x[which.max(bc$y)])
## [1] 0.06060606
# Fit on the Box-Cox-transformed response. The `.` on the RHS should exclude
# realSum because it already appears (inside an expression) on the LHS —
# NOTE(review): confirm the dot expansion behaves this way here.
bc_model = lm(((realSum^lambda-1)/lambda) ~ ., data = london)
# RMSE is computed on the original price scale via the inverse transform
resid = invBoxCox(fitted(bc_model), lambda) - london$realSum
rmse = sqrt(sum(resid^2)/nrow(london))
r2 = summary(bc_model)$r.squared
coefs = nrow(summary(bc_model)$coeff)
row = data.frame(model = "bc_model", rmse = rmse, r2 = r2, coefficients = coefs )
row
## model rmse r2 coefficients
## 1 bc_model 94.51379 0.6909216 28
rbind(transform_results, row)
## model rmse r2 coefficients
## 1 full_log_model 94.81446 0.6891645 28
## 2 bc_model 94.51379 0.6909216 28
The Box-Cox transform did not help significantly.
We will now try transformations of other variables. Distance may have
a skewed distribution, with being very close to the city center being
more important.
hist(log(london$dist),
main = "Distribution of dist",
xlab = "dist", col = "steelblue",
prob = TRUE)

# Histogram of sqrt-transformed distance with a fitted normal density overlaid
hist(sqrt(london$dist),
     main = "Distribution of sqrt(dist)",  # fixed typo: was "sqrt*dist)"
     xlab = "sqrt(dist)", col = "steelblue", prob = TRUE)
curve(dnorm(x, mean = mean(sqrt(london$dist)), sd = sd(sqrt(london$dist))),
      col = "darkorange", add = TRUE, lwd = 3)

The transform of distance may allow for a better model. We will try a
number of transformations.
Interaction Analysis
Next do a model with interactions, and then reduce the number of
variables with backward AIC. We will start with the full log model.
Fit a full interaction model
full_interact = lm(log(realSum) ~ .^2, data = london)
interact_results = log_diag(full_interact, "full_interact")
interact_results
## model rmse r2 coefficients
## 1 full_interact 87.61681 0.7339381 306
The full interaction model has a higher \(R^2\) value than any of the transformation
models above. We will have to check if there is overfitting. The
full_interact model has a significantly lower \(RMSE\) than the transformation models. The
\(RMSE\) is about 7 units lower for the
interaction model than the transformation models.
The full interaction model is quite large. Let us use AIC and BIC to
reduce the number of predictors.
AIC decreases the number of coefficients from 306 in the full
interaction model to 142 with a decrease in \(R^2\) from 0.7339381 to 0.7254884, and an
increase in \(RMSE\) from 87.61681 to
89.33073. This is with a decrease of on the order of 100 predictors.
Try BIC to get a smaller model
BIC decreases the number of coefficients from 306 in the full
interaction model to 36 with a decrease in \(R^2\) from 0.7339381 to 0.7038121, and an
increase in \(RMSE\) from 87.61681 to
92.78446. This is with a decrease of on the order of 157 predictors.
The AIC and BIC calculations take a tremendous amount of time to run,
so we presented the results as above.
Putting It Together
Try a model with transformations and interactions. Start with the
model from the BIC elimination model above and add the
sqrt(rest_index_norm) transformation.
Fit the combined model
# The BIC-selected interaction model plus an added sqrt(rest_index_norm) term
smaller_combined_model = lm(log(realSum) ~ room_type + person_capacity + multi +
biz + cleanliness_rating + guest_satisfaction_overall + bedrooms +
dist + metro_dist + attr_index_norm + rest_index_norm + lng +
lat + room_type:multi + room_type:cleanliness_rating + biz:lat +
guest_satisfaction_overall:rest_index_norm + dist:lng + dist:lat +
metro_dist:lng + metro_dist:lat + attr_index_norm:rest_index_norm + sqrt(rest_index_norm), data = london)
combined_results = log_diag(smaller_combined_model, "smaller_combined_model")
combined_results
## model rmse r2 coefficients
## 1 smaller_combined_model 92.23517 0.7063175 43
Adding sqrt(rest_index_norm) doesn’t really change model
performance.
Let us remove predictors to obtain a simpler, more interpretable model. We removed
predictors with high collinearity and high p-values (low statistical significance).
# Manually reduced model: a subset of main effects plus three interactions,
# dropping predictors flagged for collinearity
smallest_combined_model = lm(log(realSum) ~ room_type + person_capacity + guest_satisfaction_overall
+ bedrooms + dist + attr_index_norm + lng +
+ multi:guest_satisfaction_overall
+ dist:rest_index_norm + metro_dist:attr_index_norm, data = london)
# NOTE(review): this prints VIFs for smaller_combined_model (the LARGER model),
# presumably to justify which predictors were dropped — confirm this is intended
# and not meant to be vif(smallest_combined_model).
vif(smaller_combined_model)
## room_type
## 3.755532e+01
## person_capacity3
## 1.122164e+00
## person_capacity4
## 1.657643e+00
## person_capacity5
## 1.259893e+00
## person_capacity6
## 1.779930e+00
## multi1
## 1.300688e+01
## biz1
## 4.534500e+06
## cleanliness_rating7
## 2.030295e+01
## cleanliness_rating8
## 5.127873e+01
## cleanliness_rating9
## 8.281140e+01
## cleanliness_rating10
## 9.555969e+01
## guest_satisfaction_overall89
## 2.946576e+01
## guest_satisfaction_overall95
## 3.150349e+01
## guest_satisfaction_overall99
## 2.808703e+01
## guest_satisfaction_overall100
## 3.023714e+01
## bedrooms1
## 2.851566e+00
## bedrooms2
## 3.167665e+00
## bedrooms3
## 1.451119e+00
## bedrooms4
## 1.043989e+00
## bedrooms5
## 1.007863e+00
## dist
## 5.290257e+06
## metro_dist
## 3.009876e+06
## attr_index_norm
## 1.851396e+02
## rest_index_norm
## 2.379419e+03
## lng
## 1.736983e+01
## lat
## 1.958543e+01
## sqrt(rest_index_norm)
## 1.810220e+03
## room_type:multi1
## 1.335058e+01
## room_type:cleanliness_rating7
## 2.121854e+01
## room_type:cleanliness_rating8
## 5.694129e+01
## room_type:cleanliness_rating9
## 8.995277e+01
## room_type:cleanliness_rating10
## 1.151182e+02
## biz1:lat
## 4.534699e+06
## guest_satisfaction_overall89:rest_index_norm
## 3.137775e+01
## guest_satisfaction_overall95:rest_index_norm
## 2.902455e+01
## guest_satisfaction_overall99:rest_index_norm
## 2.267675e+01
## guest_satisfaction_overall100:rest_index_norm
## 2.407208e+01
## dist:lng
## 2.859645e+01
## dist:lat
## 5.282880e+06
## metro_dist:lng
## 8.679526e+00
## metro_dist:lat
## 3.005357e+06
## attr_index_norm:rest_index_norm
## 4.233686e+02
row = log_diag(smallest_combined_model, "smallest_combined_model")
row
## model rmse r2 coefficients
## 1 smallest_combined_model 95.76071 0.6817635 25
combined_results = rbind(combined_results, row)
The smallest combined model has far fewer predictors, but does suffer
in performance.
Overall comparison of models
# Combine every experiment's metrics and sort by RMSE (best model first).
# NOTE(review): `sum` masks base::sum from here on; a safer name would be
# e.g. all_results.
sum = rbind(linear_exp_results, interact_results)
sum = rbind(sum, combined_results)
sum = rbind(sum, transform_results)
sum = sum[order(sum$rmse),]
sum
## model rmse r2 coefficients
## 4 full_interact 87.61681 0.7339381 306
## 5 smaller_combined_model 92.23517 0.7063175 43
## 1 full_additive_mod 93.93584 0.6744026 28
## 2 aic_full_additive_mod 93.94794 0.6743188 27
## 20 sqrt(rest_index_norm) 94.22083 0.6936952 29
## 18 rest_index_norm^2 94.22454 0.6935070 29
## 21 log(rest_index_norm) 94.24927 0.6935628 29
## 19 rest_index_norm^3 94.27339 0.6930946 29
## 15 sqrt(attr_index_norm) 94.35043 0.6915503 29
## 22 1/rest_index_norm 94.35254 0.6929745 29
## 16 log(attr_index_norm) 94.35644 0.6915419 29
## 13 attr_index_norm^2 94.40055 0.6911920 29
## 17 1/attr_index_norm 94.41537 0.6912526 29
## 14 attr_index_norm^3 94.45567 0.6908484 29
## 3 bic_full_additive_model 94.51618 0.6703671 19
## 7 poly 4 94.52507 0.6911600 31
## 24 lat^3 94.63057 0.6896452 29
## 23 lat^2 94.63060 0.6896451 29
## 27 1/lat 94.63070 0.6896447 29
## 28 lng^2 94.65548 0.6895303 29
## 61 1/dist 94.66025 0.6904071 29
## 51 log(dist) 94.70349 0.6900743 29
## 41 sqrt(dist) 94.72913 0.6898768 29
## 9 metro_dist^3 94.78832 0.6893279 29
## 8 metro_dist^2 94.79270 0.6892886 29
## 210 dist^2 94.79330 0.6893810 29
## 32 1/lng 94.79698 0.6894302 29
## 29 lng^3 94.79834 0.6891724 29
## 10 sqrt(metro_dist) 94.80382 0.6892254 29
## 11 log(metro_dist) 94.80783 0.6892024 29
## 33 dist^3 94.81241 0.6892170 29
## 12 1/metro_dist 94.81337 0.6891668 29
## 110 full_log_model 94.81446 0.6891645 28
## 25 sqrt(lat) 94.81446 0.6891645 28
## 26 log(lat) 94.81446 0.6891645 28
## 6 smallest_combined_model 95.76071 0.6817635 25
## 31 log(lng) 227.61012 0.9471637 26
## 30 sqrt(lng) 227.63476 0.9456222 26
LOOCV to Determine Overfitting
The mean average error goes up tremendously for the interaction
model, indicating overfitting. The above code is disabled as it takes
quite some time to run.
Assumption Analysis
Function to perform diagnostic tests of LINE assumptions
# Side-by-side Fitted-vs-Residuals and Normal Q-Q plots for a fitted model,
# to visually assess the linearity / equal-variance / normality assumptions.
#
# model:      a fitted lm object
# pcol, lcol: point and reference-line colours
# alpha:      accepted for backward compatibility but not used by any plot here
assumpt = function(model, pcol = "gray", lcol = "dodgerblue", alpha = 0.05){
  # BUG FIX: restore the caller's graphics layout on exit instead of leaving
  # par(mfrow = c(1, 2)) set globally for all subsequent plots
  oldpar = par(mfrow = c(1, 2))
  on.exit(par(oldpar), add = TRUE)
  plot(fitted(model), resid(model), col = pcol, pch = 20,
       xlab = "Fitted", ylab = "Residuals",
       main = paste("Fitted vs. Residuals for ", substitute(model), sep = ""))
  abline(h = 0, col = lcol, lwd = 2)
  qqnorm(resid(model), main = paste("Normal Q-Q Plot for ", substitute(model), sep = ""), col = pcol, pch = 20)
  qqline(resid(model), col = lcol, lwd = 2)
}
assumpt(full_additive_mod)

assumpt(full_interact)

assumpt(smaller_combined_model)

assumpt(smallest_combined_model)

table = data.frame()
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Shapiro-Wilk (normality of residuals) and Breusch-Pagan (constant variance)
# p-values for each candidate model. The [1:4999] subset respects shapiro.test's
# 5000-observation cap; indexing past the residual length yields NAs, which are
# presumably dropped by the test — TODO confirm.
tab = data.frame(Model = "full_additive_mod", Shapiro = shapiro.test(resid(full_additive_mod)[1:4999])$p.value, BP = unname(bptest(full_additive_mod)$p.value))
row = data.frame(Model = "full_interact", Shapiro = shapiro.test(resid(full_interact)[1:4999])$p.value, BP = unname(bptest(full_interact)$p.value))
tab = rbind(tab, row)
row = data.frame(Model = "smaller_combined_model", Shapiro = shapiro.test(resid(smaller_combined_model)[1:4999])$p.value, BP = unname(bptest(smaller_combined_model)$p.value))
tab = rbind(tab, row)
# BUG FIX: the BP entry for smallest_combined_model previously re-ran
# bptest(smaller_combined_model), duplicating the row above instead of
# testing the model this row is labeled with.
row = data.frame(Model = "smallest_combined_model", Shapiro = shapiro.test(resid(smallest_combined_model)[1:4999])$p.value,
                 BP = unname(bptest(smallest_combined_model)$p.value))
tab = rbind(tab, row)
kable(tab)
| full_additive_mod |
0 |
0.00e+00 |
| full_interact |
0 |
5.78e-05 |
| smaller_combined_model |
0 |
0.00e+00 |
| smallest_combined_model |
0 |
0.00e+00 |
All of the BP and Shapiro tests have very small values, showing up as
zeros in the above table. We will see if removing unusual values helps
with this.
Removal of Unusual Values
In the above analysis, it looks as if the smaller_combined_model
works the best. We will now eliminate unusual values.
Remove unusual observations
# Flag influential observations: Cook's distance above the 4/n rule of thumb
unusual_index = cooks.distance(smaller_combined_model)>(4/nrow(london))
london_no_unusual = london[!unusual_index,]
london_no_unusual = na.omit(london_no_unusual)
# How many rows were dropped, absolutely and as a fraction of the data
nrow(london) - nrow(london_no_unusual)
## [1] 192
(nrow(london) - nrow(london_no_unusual))/nrow(london)
## [1] 0.04526167
# Largest remaining price after removal
max(london_no_unusual$realSum)
## [1] 833.0393
Removing points with a Cook's distance > 4/n eliminates 192 points,
or 4.5% of the total number of rows.
library(lmtest)
# Refit the chosen interaction model on the data with unusual points removed
smaller_combined_no_unusual = lm(log(realSum) ~ room_type + person_capacity + multi +
biz + cleanliness_rating + guest_satisfaction_overall + bedrooms +
dist + metro_dist + attr_index_norm + rest_index_norm + lng +
lat + room_type:multi + room_type:cleanliness_rating + biz:lat +
guest_satisfaction_overall:rest_index_norm + dist:lng + dist:lat +
metro_dist:lng + metro_dist:lat + attr_index_norm:rest_index_norm, data = london_no_unusual)
# NOTE(review): log_diag computes residuals against the full `london` data, but
# this model was fitted on the shorter `london_no_unusual` — hence the recycling
# warning below and an unreliable RMSE; the residuals should be taken against
# london_no_unusual$realSum.
log_diag(smaller_combined_no_unusual, "smaller_combined_no_unusual")
## Warning in exp(fitted(model)) - london$realSum: longer object length is not a
## multiple of shorter object length
## model rmse r2 coefficients
## 1 smaller_combined_no_unusual 212.4507 0.7770377 41
assumpt(smaller_combined_no_unusual)

plot(smaller_combined_no_unusual)


shapiro.test(resid(smaller_combined_no_unusual)[1:4980])
##
## Shapiro-Wilk normality test
##
## data: resid(smaller_combined_no_unusual)[1:4980]
## W = 0.99612, p-value = 8.544e-09
bptest(smaller_combined_no_unusual)
##
## studentized Breusch-Pagan test
##
## data: smaller_combined_no_unusual
## BP = 181.38, df = 40, p-value < 2.2e-16
Fitting a model that leaves out unusual observations increases \(R^2\), but also increases \(RMSE\) significantly.
Model Evaluation for Overfitting
# BUG FIX: the split indicator was drawn with nrow(london_no_unusual) rows but
# used to subset `london`, a longer data frame, so the logical vector was
# silently recycled. Size the indicator to the data frame actually being split.
sample = sample(c(TRUE, FALSE), nrow(london), replace=TRUE, prob=c(0.7,0.3))
london_train = london[sample, ]
london_test = london[!sample, ]
Diagnostics for test train split
# Diagnostics for a model fit on log(realSum): back-transform predictions
# with exp() and report RMSE on the original price scale as a one-row
# data frame tagged with the model and dataset names.
log_predict_diag = function(true_data, fit_data, model, dataset){
  errors = exp(unname(fit_data)) - unname(true_data)
  data.frame(model = model,
             dataset = dataset,
             rmse = sqrt(sum(errors^2) / length(fit_data)))
}
Check the smaller_combined_train model for overfitting
# Refit the BIC-selected interaction model on the training split only.
smaller_combined_train = lm(log(realSum) ~ room_type + person_capacity + multi +
biz + cleanliness_rating + guest_satisfaction_overall + bedrooms +
dist + metro_dist + attr_index_norm + rest_index_norm + lng +
lat + room_type:multi + room_type:cleanliness_rating + biz:lat +
guest_satisfaction_overall:rest_index_norm + dist:lng + dist:lat +
metro_dist:lng + metro_dist:lat + attr_index_norm:rest_index_norm, data = london_train)
# RMSE on the held-out test split.
test_fit = predict(smaller_combined_train, london_test)
eval_results = log_predict_diag(london_test$realSum, test_fit, "smaller_combined_model", "test")
eval_results
## model dataset rmse
## 1 smaller_combined_model test 92.41605
# RMSE on the training split, for comparison against the test RMSE above
# to gauge overfitting.
train_fit = predict(smaller_combined_train, london_train)
row = log_predict_diag(london_train$realSum, train_fit, "smaller_combined_model", "training")
row
## model dataset rmse
## 1 smaller_combined_model training 92.73717
eval_results = rbind(eval_results, row)
# Predicted vs. true prices on the test split.
plot(exp(unname(test_fit)) ~ unname(london_test$realSum))
# BUG FIX: abline(1, 1) draws y = 1 + x; the intended perfect-prediction
# reference line is the identity y = x (intercept 0, slope 1).
abline(0, 1)

Check the additive model for overfitting
# Baseline comparison: fully additive log model fit on the training split.
full_additive_train = lm(log(realSum) ~ ., data = london_train)
# RMSE on the held-out test split.
test_fit = predict(full_additive_train, london_test)
row = log_predict_diag(london_test$realSum, test_fit, "full_log_model", "test")
row
## model dataset rmse
## 1 full_log_model test 94.85646
eval_results = rbind(eval_results, row)
# Training-split RMSE for the additive model, for the overfitting comparison.
train_fit = predict(full_additive_train, london_train)
row = log_predict_diag(london_train$realSum, train_fit, "full_log_model", "training")
row
## model dataset rmse
## 1 full_log_model training 95.04888
eval_results = rbind(eval_results, row)
Check the full interaction model for overfitting
# Full two-way interaction model on the training split; with this many
# parameters it is the model most likely to overfit.
full_interact_train = lm(log(realSum) ~ .^2, data = london_train)
test_fit = predict(full_interact_train, london_test)
## Warning in predict.lm(full_interact_train, london_test): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Test-split RMSE for the full interaction model.
row = log_predict_diag(london_test$realSum, test_fit, "full_interact_no_unusual_train", "test")
row
## model dataset rmse
## 1 full_interact_no_unusual_train test 97.73329
eval_results = rbind(eval_results, row)
# Training-split predictions for the interaction model.
train_fit = predict(full_interact_train, london_train)
## Warning in predict.lm(full_interact_train, london_train): prediction from
## rank-deficient fit; attr(*, "non-estim") has doubtful cases
# Training-split RMSE; note it is markedly lower than the test RMSE above,
# which is the signature of overfitting.
row = log_predict_diag(london_train$realSum, train_fit, "full_interact_no_unusual_train", "training")
row
## model dataset rmse
## 1 full_interact_no_unusual_train training 86.96632
# Append the final row and render the full test/train comparison table.
eval_results = rbind(eval_results, row)
kable(eval_results)
| smaller_combined_model |
test |
92.41605 |
| smaller_combined_model |
training |
92.73717 |
| full_log_model |
test |
94.85646 |
| full_log_model |
training |
95.04888 |
| full_interact_no_unusual_train |
test |
97.73329 |
| full_interact_no_unusual_train |
training |
86.96632 |
As we can see above, the full interaction model exhibits significant
overfitting, with a larger gap between test and training RMSE than is
seen in the other models. Please see the table above.
Results
We used multiple linear regression with a simple fully additive model
(full_additive_mod) which provided for a \(RMSE\) of 93.93584 and an \(R^2\) value of 0.6744026 when predicting
Airbnb room rates (response, realSum). This model used dummy variables
for categorical predictors such as room_type and multi. The mean of
realSum for our dataset is 310.8121. We tried a number of methods to
optimise the number of parameters of this model such as AIC, and BIC
backward selection starting with the additive model as the base model.
BIC decreased the number of predictors from 28 to 19, with an increase in
RMSE to 94.51618 and a decrease in \(R^2\) to 0.6703671. These differences in
performance are relatively small, and therefore we continued with the
BIC model for further analysis.
Looking at the pairs plot in the exploratory data analysis section,
the relationship between many of the variables appears nonlinear,
indicating that the addition of predictor interactions and
transformations may be helpful. A slight improvement in our model was
afforded by the log transform of realSum. This model (full_log_model)
had a slightly improved \(R^2\) of
0.6891645, but a slightly higher \(RMSE\) of 94.81446.
# Predicted vs. true prices for the full additive model.
plot(fitted(full_additive_mod), london$realSum,
     xlab = "Predicted Price",
     ylab = "True Price",
     main = "Correlation for Full Additive Model",
     col = "darkorange")
# BUG FIX: abline(1, 1) draws y = 1 + x; the intended perfect-prediction
# reference line is the identity y = x (intercept 0, slope 1).
abline(0, 1)

# Predicted vs. true prices for the log-response model (predictions
# back-transformed with exp to the original price scale).
plot(exp(fitted(full_log_model)), london$realSum,
     xlab = "Predicted Price",
     ylab = "True Price",
     main = "Correlation for Full Log Model",
     col = "darkorange")
# BUG FIX: abline(1, 1) draws y = 1 + x; the intended perfect-prediction
# reference line is the identity y = x (intercept 0, slope 1).
abline(0, 1)

As you can see from the plots above, both models provide a
fair fit to the data, although there are some prices that are
under-predicted by the model.
We tried a number of other transformations, including Box-Cox, among
others, none of which had much of an impact on the apparent fit of the
model.
Next, we tried the full interaction model. This improved the \(R^2\) to 0.7339381, decreased the \(RMSE\) to 87.61681. This model also showed
evidence of overfitting, as the RMSE went up significantly when we
compared these metrics for test and train data. The RMSE went from
87.65217 when predictions were made using training data to 94.34445
using test data. In the end, we used BIC to reduce the full interaction
model to a more tractable model, smaller_combined_model. This
smaller_combined_model was the mainstay of the rest of our analysis.
Our suspicion grew that unusual observations may be decreasing model
performance. This can be seen in residuals vs. fitted plot for points
1506, 4518, and 1375, for example.
# Final BIC-reduced interaction model, refit on the full data set.
# (The stray doubled "+" signs in the original formula were redundant
# and have been removed; the fitted model is unchanged.)
smaller_combined_model = lm(
  log(realSum) ~ room_type + person_capacity + cleanliness_rating +
    guest_satisfaction_overall + bedrooms + dist + attr_index_norm +
    lng + multi:guest_satisfaction_overall + biz:cleanliness_rating +
    cleanliness_rating:guest_satisfaction_overall +
    dist:rest_index_norm + metro_dist:attr_index_norm,
  data = london
)
# Standard lm diagnostic plots (residuals vs. fitted, Q-Q, scale-location,
# residuals vs. leverage).
plot(smaller_combined_model)
## Warning: not plotting observations with leverage one:
## 168




4/nrow(london)
## [1] 0.0009429514
In the end, we eliminated 192 rows of data that had a Cook's distance
of over 4/n. This did not improve the fit significantly.
One difficulty that we had was assuring ourselves that our model did
not violate the assumptions of constant variance and normality. Our best
model, smaller_combined_no_unusual_train, did look to have a fairly good
Q-Q plot and Predicted vs. Residuals plot.
assumpt(smaller_combined_no_unusual)

This model did not pass the Shapiro-Wilk test, but this test is known to
reject the null hypothesis of normality for even trivial deviations when
the sample size is large. This model did not improve prediction.
In the end our final model:
smaller_combined_model$call
## lm(formula = log(realSum) ~ room_type + person_capacity + cleanliness_rating +
## guest_satisfaction_overall + bedrooms + dist + attr_index_norm +
## lng + +multi:guest_satisfaction_overall + biz:cleanliness_rating +
## cleanliness_rating:guest_satisfaction_overall + dist:rest_index_norm +
## metro_dist:attr_index_norm, data = london)
summary(smaller_combined_model)
##
## Call:
## lm(formula = log(realSum) ~ room_type + person_capacity + cleanliness_rating +
## guest_satisfaction_overall + bedrooms + dist + attr_index_norm +
## lng + +multi:guest_satisfaction_overall + biz:cleanliness_rating +
## cleanliness_rating:guest_satisfaction_overall + dist:rest_index_norm +
## metro_dist:attr_index_norm, data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.86405 -0.19408 -0.02471 0.16538 1.55883
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error
## (Intercept) 5.5990305 0.0839572
## room_type -0.5323710 0.0115668
## person_capacity3 0.1081570 0.0163100
## person_capacity4 0.2118671 0.0146068
## person_capacity5 0.2453332 0.0282497
## person_capacity6 0.3125892 0.0260065
## cleanliness_rating7 -0.1478547 0.0683821
## cleanliness_rating8 -0.1061765 0.0530661
## cleanliness_rating9 -0.0015624 0.0577385
## cleanliness_rating10 0.0496853 0.0698509
## guest_satisfaction_overall89 -0.2051742 0.0851169
## guest_satisfaction_overall95 -0.0462691 0.0574068
## guest_satisfaction_overall99 0.0026430 0.0577060
## guest_satisfaction_overall100 -0.0611435 0.2066237
## bedrooms1 0.1236607 0.0178590
## bedrooms2 0.3535167 0.0238231
## bedrooms3 0.4259153 0.0456580
## bedrooms4 0.3298976 0.1111963
## bedrooms5 0.2534613 0.2896622
## dist -0.0143356 0.0056271
## attr_index_norm 0.0154687 0.0013623
## lng -0.7231590 0.0792229
## guest_satisfaction_overall79:multi1 0.0228244 0.0449469
## guest_satisfaction_overall89:multi1 0.0061099 0.0257666
## guest_satisfaction_overall95:multi1 0.0214344 0.0206485
## guest_satisfaction_overall99:multi1 0.0131803 0.0211427
## guest_satisfaction_overall100:multi1 0.0696903 0.0209854
## cleanliness_rating6:biz1 0.0113926 0.0586974
## cleanliness_rating7:biz1 -0.0007260 0.0573620
## cleanliness_rating8:biz1 -0.0065908 0.0288453
## cleanliness_rating9:biz1 0.0051997 0.0195708
## cleanliness_rating10:biz1 0.0783980 0.0171228
## cleanliness_rating7:guest_satisfaction_overall89 0.1746772 0.1011630
## cleanliness_rating8:guest_satisfaction_overall89 0.2185717 0.0887036
## cleanliness_rating9:guest_satisfaction_overall89 0.1267715 0.0924061
## cleanliness_rating10:guest_satisfaction_overall89 0.1791003 0.1036366
## cleanliness_rating7:guest_satisfaction_overall95 0.2209061 0.1301632
## cleanliness_rating8:guest_satisfaction_overall95 0.0114442 0.0760178
## cleanliness_rating9:guest_satisfaction_overall95 0.0047679 0.0688641
## cleanliness_rating10:guest_satisfaction_overall95 NA NA
## cleanliness_rating7:guest_satisfaction_overall99 0.5323214 0.2187831
## cleanliness_rating8:guest_satisfaction_overall99 0.0907735 0.1345552
## cleanliness_rating9:guest_satisfaction_overall99 -0.0735795 0.0727053
## cleanliness_rating10:guest_satisfaction_overall99 NA NA
## cleanliness_rating7:guest_satisfaction_overall100 NA NA
## cleanliness_rating8:guest_satisfaction_overall100 0.1295837 0.2128966
## cleanliness_rating9:guest_satisfaction_overall100 0.0778805 0.2124132
## cleanliness_rating10:guest_satisfaction_overall100 0.0640044 0.2136199
## dist:rest_index_norm 0.0068528 0.0006933
## attr_index_norm:metro_dist -0.0020254 0.0007039
## t value Pr(>|t|)
## (Intercept) 66.689 < 2e-16 ***
## room_type -46.026 < 2e-16 ***
## person_capacity3 6.631 3.75e-11 ***
## person_capacity4 14.505 < 2e-16 ***
## person_capacity5 8.684 < 2e-16 ***
## person_capacity6 12.020 < 2e-16 ***
## cleanliness_rating7 -2.162 0.030660 *
## cleanliness_rating8 -2.001 0.045475 *
## cleanliness_rating9 -0.027 0.978413
## cleanliness_rating10 0.711 0.476934
## guest_satisfaction_overall89 -2.410 0.015974 *
## guest_satisfaction_overall95 -0.806 0.420297
## guest_satisfaction_overall99 0.046 0.963472
## guest_satisfaction_overall100 -0.296 0.767308
## bedrooms1 6.924 5.05e-12 ***
## bedrooms2 14.839 < 2e-16 ***
## bedrooms3 9.328 < 2e-16 ***
## bedrooms4 2.967 0.003026 **
## bedrooms5 0.875 0.381611
## dist -2.548 0.010882 *
## attr_index_norm 11.355 < 2e-16 ***
## lng -9.128 < 2e-16 ***
## guest_satisfaction_overall79:multi1 0.508 0.611615
## guest_satisfaction_overall89:multi1 0.237 0.812571
## guest_satisfaction_overall95:multi1 1.038 0.299302
## guest_satisfaction_overall99:multi1 0.623 0.533058
## guest_satisfaction_overall100:multi1 3.321 0.000905 ***
## cleanliness_rating6:biz1 0.194 0.846115
## cleanliness_rating7:biz1 -0.013 0.989902
## cleanliness_rating8:biz1 -0.228 0.819278
## cleanliness_rating9:biz1 0.266 0.790494
## cleanliness_rating10:biz1 4.579 4.82e-06 ***
## cleanliness_rating7:guest_satisfaction_overall89 1.727 0.084297 .
## cleanliness_rating8:guest_satisfaction_overall89 2.464 0.013777 *
## cleanliness_rating9:guest_satisfaction_overall89 1.372 0.170169
## cleanliness_rating10:guest_satisfaction_overall89 1.728 0.084034 .
## cleanliness_rating7:guest_satisfaction_overall95 1.697 0.089743 .
## cleanliness_rating8:guest_satisfaction_overall95 0.151 0.880341
## cleanliness_rating9:guest_satisfaction_overall95 0.069 0.944805
## cleanliness_rating10:guest_satisfaction_overall95 NA NA
## cleanliness_rating7:guest_satisfaction_overall99 2.433 0.015012 *
## cleanliness_rating8:guest_satisfaction_overall99 0.675 0.499955
## cleanliness_rating9:guest_satisfaction_overall99 -1.012 0.311585
## cleanliness_rating10:guest_satisfaction_overall99 NA NA
## cleanliness_rating7:guest_satisfaction_overall100 NA NA
## cleanliness_rating8:guest_satisfaction_overall100 0.609 0.542776
## cleanliness_rating9:guest_satisfaction_overall100 0.367 0.713901
## cleanliness_rating10:guest_satisfaction_overall100 0.300 0.764483
## dist:rest_index_norm 9.884 < 2e-16 ***
## attr_index_norm:metro_dist -2.877 0.004032 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2881 on 4195 degrees of freedom
## Multiple R-squared: 0.6925, Adjusted R-squared: 0.6891
## F-statistic: 205.3 on 46 and 4195 DF, p-value: < 2.2e-16
seemed optimal in terms of providing low RMSE, a reasonable
number of predictors, and good \(R^2\).
The main predictors, including room_type, person_capacity,
cleanliness_rating, guest_satisfaction, and bedrooms seem like things
that would impact the cost of a room. Also, distance to the city and
attractions would make a room more desirable. The interactions make
sense as well, implying synergies between the “distance from city center
to restaurants” and “cleanliness rating and satisfaction overall”.